library(tidyverse)

# Load the per-sample genotype exports, one CSV file per variant
# (rs67841474 and rs41293539, going by the file names).
rs67 <- read_csv(
  file = '373507-SampleGenotypes-Homo_sapiens_Variation_Sample_rs67841474.csv'
)
rs41 <- read_csv(
  file = '373507-SampleGenotypes-Homo_sapiens_Variation_Sample_rs41293539.csv'
)

# Reduce each raw table to the columns used downstream, under short names.
# Every other column of the raw tables is dropped.

# rs67841474: sample label, genotype string and population string.
rs67.2 <- select(
  rs67,
  sample = `Sample (Male/Female/Unknown)`,
  geno_gg = `Genotype (forward strand)`,
  pop = `Population(s)`
)

# rs41293539: sample label and genotype string only; the population string
# is taken from the rs67841474 table when the two are joined later.
rs41.2 <- select(
  rs41,
  sample = `Sample (Male/Female/Unknown)`,
  geno_ins = `Genotype (forward strand)`
)

# Split each genotype string into its two alleles and reshape to one row per
# sample and strand. separate() is called with its default separator (any run
# of non-alphanumeric characters), so an allele written with a
# non-alphanumeric symbol comes out as the empty string ''.
# NOTE(review): the exact genotype notation is not visible here; confirm
# against the CSV files.

# rs67841474 alleles -> column `allele_gg`
allele67.2 <- rs67.2 |>
  separate(col = geno_gg, into = c('strand1', 'strand2')) |>
  pivot_longer(
    cols = c(strand1, strand2),
    names_to = 'strand',
    values_to = 'allele_gg'
  )

# rs41293539 alleles -> column `allele_ins`
allele41.2 <- rs41.2 |>
  separate(col = geno_ins, into = c('strand1', 'strand2')) |>
  pivot_longer(
    cols = c(strand1, strand2),
    names_to = 'strand',
    values_to = 'allele_ins'
  )

# Pair the alleles of the two variants per sample and strand, then derive
# population labels from the `pop` string.

# Codes recognised as the broad population label (`pop5`).
# NOTE(review): presumably the five super-population codes used in the
# export; confirm against the data.
superpop_regex <- 'EAS|SAS|EUR|AFR|AMR'

mica5.1 <- allele41.2 |>
  # Join keys are spelled out so the match cannot silently change if the two
  # tables ever share another column. Rows of allele41.2 with no partner in
  # allele67.2 are kept, with NA in `allele_gg` and `pop`.
  left_join(allele67.2, by = join_by(sample, strand)) |>
  mutate(
    # Drop the first occurrence of 'ALL' so it cannot be picked up as a
    # three-letter code below.
    pop = str_remove(pop, 'ALL'),
    # First broad code found in the remaining string.
    pop5 = str_extract(pop, superpop_regex),
    # First run of three capital letters left once that broad code is removed.
    subpop = pop |>
      str_remove(superpop_regex) |>
      str_extract('[A-Z]{3}')
  )

# Bar chart of how often `allele_ins` is the empty string while `allele_gg`
# is 'GG', computed separately for each level of a grouping column.
#
# data:      tibble with columns `allele_ins`, `allele_gg` and the column
#            named by `group_col`.
# group_col: name (character string) of the grouping column, e.g. 'subpop'.
#
# Returns a ggplot object; bars are ordered by increasing frequency.
plot_ins_gg_freq <- function(data, group_col) {
  data |>
    mutate(ins.gg = (allele_ins == '' & allele_gg == 'GG')) |>
    # `ins.gg` is NA when a missing allele leaves the combination undetermined.
    # Such rows are left out of the frequency rather than turning the whole
    # group's value into NA. Without NAs this equals sum(ins.gg) / n().
    summarise(maf.gg = mean(ins.gg, na.rm = TRUE), .by = all_of(group_col)) |>
    ggplot(aes(fct_reorder(.data[[group_col]], maf.gg), maf.gg)) +
    geom_col() +
    # Label the x axis with the column name instead of the raw expression.
    labs(x = group_col)
}

# Frequency per three-letter population code.
plot_ins_gg_freq(mica5.1, 'subpop')

# Frequency per broad population code.
plot_ins_gg_freq(mica5.1, 'pop5')
